/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.quality; import java.io.*; import java.util.*; import net.nutch.searcher.*; import net.nutch.quality.dynamic.*; /****************************************** * The QueryGenerator will process a list of * query terms, and apply them to a search engine. * Using a sherlock-format description of the * HTML output, we'll then extract the results. * We emit the query/result set in a format that * can be used by ResultTestTool * * @author Mike Cafarella ******************************************/ public class ResultSetGenerator { PageExtractor.IExtractor extractor; boolean debug = false; /** * A query generator needs to know about the item * it is interrogating. */ public ResultSetGenerator(PageExtractor.IExtractor extractor, boolean debug) throws IOException, ParseException { this.extractor = extractor; this.debug = debug; } /** * Iterate through all the queries. Obtain results for each * one, and write the results out to the indicated file. */ public void processQueries(File queryList, File outputResultSet) throws IOException { // Load in the query terms Vector queries = new Vector(); BufferedReader in = new BufferedReader(new FileReader(queryList)); try { String term = in.readLine(); while (term != null) { queries.add(term.trim()); term = in.readLine(); } } finally { in.close(); } // Output the search results DataOutputStream out = new DataOutputStream(new FileOutputStream(outputResultSet)); try { out.writeInt(queries.size()); for (Enumeration e = queries.elements(); e.hasMoreElements(); ) { String term = (String) e.nextElement(); ArrayList results = null; try { results = extractor.applyQuery(term); } catch (IOException ie) { System.err.println("Could not extract results for " + term); } int numResults = 0; if (results != null) { numResults = Math.min(10, results.size()); } out.writeUTF(term); out.writeInt(numResults); if (debug) { System.out.println("For "+ term + ", " + numResults); } for (int i = 0; i < numResults; i++) { String str = (String) results.get(i); out.writeUTF(str); } } } finally { out.close(); } } /** * Give a set of queries, and generate a set of responses from the * given query target */ public static void main(String argv[]) throws IOException, ParseException { if (argv.length < 4) { System.out.println("Usage: java net.nutch.quality.ResultSetGenerator [-externalengine <pageDesc> <userAgent>] [-nutchengine <segments>] <queryList> <outputResultSet> [-debug]"); return; } int pos = argv.length; boolean debug = false; String pageDesc = null, userAgent = null, segments = null, queryList = null, outputSet = null; // Parse command if ("-externalengine".equals(argv[0])) { pageDesc = argv[1]; userAgent = argv[2]; pos = 3; } else if ("-nutchengine".equals(argv[0])) { segments = argv[1]; pos = 2; } else { System.out.println("Must use command -externalengine or -nutchengine"); return; } // Get rest of args queryList = argv[pos++]; outputSet = argv[pos++]; if (argv.length > pos && "-debug".equals(argv[pos])) { debug = true; } // Prepare the extractor PageExtractor.IExtractor extractor = null; if ("-externalengine".equals(argv[0])) { extractor = new PageExtractor.RemotePageExtractor(new File(pageDesc), userAgent, debug); } else if ("-nutchengine".equals(argv[0])) { extractor = new PageExtractor.NutchExtractor(segments); } // Extract the results! ResultSetGenerator rsg = new ResultSetGenerator(extractor, true); rsg.processQueries(new File(queryList), new File(outputSet)); } }